In this task we are asked to select five bookmakers and check whether the over/under 2.5 result of a game can be explained by their odds. I selected William Hill, 888sport, 188BET, Sportingbet and Tipico. The code below shows the data preparation.
require(data.table)
require(anytime)
require(plotly)
require(plyr)
require(MASS)
require(imputeTS)
require(jpeg)
# Fix the random seed so that the random steps further down are reproducible.
set.seed(12)

# Load the match results and the odds history from their RDS snapshots.
matches <- data.table(readRDS("df9b1196-e3cf-4cc7-9159-f236fe738215_matches.RDS"))
odds <- data.table(readRDS("df9b1196-e3cf-4cc7-9159-f236fe738215_odd_details.RDS"))

# Drop exact duplicate match rows.
matches <- unique(matches)

# Selecting Soccer Games: keep only the odds whose matchId occurs in `matches`.
odds <- odds[matchId %in% matches$matchId]

# Renaming the over/under bets based on their threshold, e.g. "over_2.5".
# paste0() is used on purpose: paste() inserts its default " " separator and
# would produce labels with an embedded space ("over_ 2.5").
odds[oddtype == "over", oddtype := paste0("over_", totalhandicap)]
odds[oddtype == "under", oddtype := paste0("under_", totalhandicap)]
# Convert the raw `date` field to a date-time and sort each home team's games
# from newest to oldest. No date-only column is built: it was never used and
# was deleted again straight away.
matches[, match_time := anytime(date)]
matches <- matches[order(home, -match_time)]
matches[, date := NULL]

# Split the "home:away" score string into two numeric goal counts.
matches[, c("HomeGoals", "AwayGoals") := lapply(tstrsplit(score, ":"), as.numeric)]
matches[, TotalGoals := HomeGoals + AwayGoals]

# Over/under 2.5 flag: 1 when more than two goals were scored, 0 otherwise.
matches[, IsOver := 0]
matches[TotalGoals > 2, IsOver := 1]

# Keep only the rows that have no missing value in any column
# (this also removes games whose score could not be parsed).
matches <- matches[complete.cases(matches)]

# Finding out about who won: 0 is draw, 1 is home win and 2 is away win.
matches[, homewin := 0]
matches[HomeGoals > AwayGoals, homewin := 1]
matches[HomeGoals < AwayGoals, homewin := 2]

# Calendar features derived from the kick-off time.
matches[, `:=`(
  Year = year(match_time),
  Month = month(match_time),
  Weekday = wday(match_time),
  Hour = hour(match_time)
)]
# Getting the final odds: sort every (match, odd type, bookmaker) series by
# `date`, then keep the last row of each series.
odds_a <- odds[order(matchId, oddtype, bookmaker, date)][
  , .(odd = odd[.N]),
  by = .(matchId, oddtype, bookmaker)
]
# Cast one bookmaker's final odds from long to wide format (one row per match,
# one column per odd type) and tag every odds column with the bookmaker suffix.
#
# long_odds: rows of `odds_a` that belong to a single bookmaker.
# suffix:    label appended to each odds column name, e.g. "_W".
# Returns the wide data.table; `matchId` keeps its name so it can be the merge key.
#
# All columns except `matchId` are renamed, whatever their number, so the code
# no longer depends on hard-coded column ranges such as 2:23. paste() with its
# default separator is kept so that the names stay "<oddtype> <suffix>" as before.
cast_bookmaker_odds <- function(long_odds, suffix) {
  wide <- dcast(long_odds, matchId ~ oddtype, value.var = "odd")
  odd_cols <- setdiff(colnames(wide), "matchId")
  setnames(wide, odd_cols, paste(odd_cols, suffix))
  wide
}

# Final odds of each of the five selected bookmakers.
odds_a_William <- odds_a[bookmaker == "William Hill"]
odds_a_888sport <- odds_a[bookmaker == "888sport"]
odds_a_188BET <- odds_a[bookmaker == "188BET"]
odds_a_Sportingbet <- odds_a[bookmaker == "Sportingbet"]
odds_a_Tipico <- odds_a[bookmaker == "Tipico"]

# The same odds in wide format, with bookmaker-specific column names.
odds_a_wide_William <- cast_bookmaker_odds(odds_a_William, "_W")
odds_a_wide_888sport <- cast_bookmaker_odds(odds_a_888sport, "_8")
odds_a_wide_188BET <- cast_bookmaker_odds(odds_a_188BET, "_B")
odds_a_wide_Sportingbet <- cast_bookmaker_odds(odds_a_Sportingbet, "_S")
odds_a_wide_Tipico <- cast_bookmaker_odds(odds_a_Tipico, "_T")
# Join the match table with the wide odds of every bookmaker on `matchId`.
# merge() keeps only the matches that are present in all tables.
merged_matches <- Reduce(
  function(left, right) merge(left, right, by = "matchId"),
  list(
    odds_a_wide_888sport,
    odds_a_wide_William,
    odds_a_wide_188BET,
    odds_a_wide_Sportingbet,
    odds_a_wide_Tipico
  ),
  matches
)

# Labels used later to colour the plots.
Over_Under <- merged_matches$IsOver
Home_win <- merged_matches$homewin

# Everything that is not an odds column is left out of the feature table.
# The columns are selected instead of being deleted with `:= NULL`, which
# would also strip them from `merged_matches` (data.table modifies in place).
non_odds_cols <- c(
  "leagueId", "home", "homewin", "away", "type", "match_time", "Year",
  "Month", "Weekday", "Hour", "score", "HomeGoals", "AwayGoals",
  "TotalGoals", "matchId", "IsOver"
)
odds_cols <- setdiff(colnames(merged_matches), non_odds_cols)

# Replacing NA's with column means. Done in base R so the script does not
# rely on imputeTS::na.mean(), the deprecated spelling of na_mean().
impute_column_mean <- function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}
merged_matches_1 <- merged_matches[, lapply(.SD, impute_column_mean), .SDcols = odds_cols]
Before applying PCA, I min-max scaled the data using the overall minimum and maximum of the whole table. Then I applied PCA.
# Min-max scale the feature table with its overall minimum and maximum.
# NOTE(review): the range is taken over the whole table, not per column, so
# all columns are shifted and shrunk by the same amount - confirm this is intended.
value_range <- range(merged_matches_1)
merged_matches_1 <- (merged_matches_1 - value_range[1]) / (value_range[2] - value_range[1])

# Principal component analysis on the covariance matrix of the scaled odds.
pca_m <- princomp(merged_matches_1, cor = FALSE)
summary(pca_m)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.3007892 0.1581031 0.10570677 0.08834488
## Proportion of Variance 0.5761426 0.1591791 0.07115594 0.04970137
## Cumulative Proportion 0.5761426 0.7353217 0.80647769 0.85617905
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.07894591 0.04924815 0.04525255 0.04071873
## Proportion of Variance 0.03968851 0.01544491 0.01304042 0.01055830
## Cumulative Proportion 0.89586756 0.91131248 0.92435290 0.93491120
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 0.039320644 0.035823324 0.032489294 0.029178092
## Proportion of Variance 0.009845705 0.008172169 0.006721808 0.005421499
## Cumulative Proportion 0.944756906 0.952929075 0.959650884 0.965072382
## Comp.13 Comp.14 Comp.15 Comp.16
## Standard deviation 0.026013609 0.023075414 0.020732736 0.017328194
## Proportion of Variance 0.004309301 0.003390819 0.002737277 0.001912107
## Cumulative Proportion 0.969381683 0.972772502 0.975509779 0.977421886
## Comp.17 Comp.18 Comp.19 Comp.20
## Standard deviation 0.016841744 0.016463476 0.016041904 0.015212012
## Proportion of Variance 0.001806258 0.001726031 0.001638768 0.001473598
## Cumulative Proportion 0.979228144 0.980954175 0.982592943 0.984066541
## Comp.21 Comp.22 Comp.23 Comp.24
## Standard deviation 0.014230220 0.014032020 0.013624282 0.013366662
## Proportion of Variance 0.001289522 0.001253851 0.001182042 0.001137762
## Cumulative Proportion 0.985356063 0.986609914 0.987791956 0.988929718
## Comp.25 Comp.26 Comp.27 Comp.28
## Standard deviation 0.012606192 0.0122817307 0.0116032365 0.0111986748
## Proportion of Variance 0.001011983 0.0009605605 0.0008573613 0.0007986175
## Cumulative Proportion 0.989941701 0.9909022618 0.9917596231 0.9925582406
## Comp.29 Comp.30 Comp.31 Comp.32
## Standard deviation 0.0104162406 0.0101279968 0.0093183474 0.0087181837
## Proportion of Variance 0.0006909197 0.0006532098 0.0005529468 0.0004840137
## Cumulative Proportion 0.9932491603 0.9939023701 0.9944553170 0.9949393306
## Comp.33 Comp.34 Comp.35 Comp.36
## Standard deviation 0.0078070231 0.0076256506 0.0072447283 0.0069462065
## Proportion of Variance 0.0003881294 0.0003703048 0.0003342334 0.0003072564
## Cumulative Proportion 0.9953274600 0.9956977648 0.9960319982 0.9963392546
## Comp.37 Comp.38 Comp.39 Comp.40
## Standard deviation 0.006801062 0.0064614961 0.006266781 0.0060718329
## Proportion of Variance 0.000294550 0.0002658715 0.000250089 0.0002347714
## Cumulative Proportion 0.996633805 0.9968996761 0.997149765 0.9973845366
## Comp.41 Comp.42 Comp.43 Comp.44
## Standard deviation 0.0058622889 0.005479961 0.0050945806 0.0049374300
## Proportion of Variance 0.0002188467 0.000191232 0.0001652808 0.0001552414
## Cumulative Proportion 0.9976033833 0.997794615 0.9979598961 0.9981151375
## Comp.45 Comp.46 Comp.47 Comp.48
## Standard deviation 0.0046172576 0.0045218804 0.0044364377 0.0042413804
## Proportion of Variance 0.0001357606 0.0001302098 0.0001253356 0.0001145566
## Cumulative Proportion 0.9982508981 0.9983811079 0.9985064435 0.9986210001
## Comp.49 Comp.50 Comp.51 Comp.52
## Standard deviation 0.004103624 3.919743e-03 3.891632e-03 3.703321e-03
## Proportion of Variance 0.000107236 9.784097e-05 9.644266e-05 8.733498e-05
## Cumulative Proportion 0.998728236 9.988261e-01 9.989225e-01 9.990099e-01
## Comp.53 Comp.54 Comp.55 Comp.56
## Standard deviation 3.682669e-03 3.514170e-03 0.0033995795 3.256081e-03
## Proportion of Variance 8.636362e-05 7.864136e-05 0.0000735963 6.751435e-05
## Cumulative Proportion 9.990962e-01 9.991749e-01 0.9992484560 9.993160e-01
## Comp.57 Comp.58 Comp.59 Comp.60
## Standard deviation 0.0031502480 3.059198e-03 2.969977e-03 2.908556e-03
## Proportion of Variance 0.0000631968 5.959649e-05 5.617094e-05 5.387168e-05
## Cumulative Proportion 0.9993791671 9.994388e-01 9.994949e-01 9.995488e-01
## Comp.61 Comp.62 Comp.63 Comp.64
## Standard deviation 2.790180e-03 2.673921e-03 2.533314e-03 0.0024155973
## Proportion of Variance 4.957584e-05 4.553054e-05 4.086801e-05 0.0000371582
## Cumulative Proportion 9.995984e-01 9.996439e-01 9.996848e-01 0.9997219388
## Comp.65 Comp.66 Comp.67 Comp.68
## Standard deviation 2.302658e-03 0.0019398604 1.820301e-03 1.742125e-03
## Proportion of Variance 3.376483e-05 0.0000239633 2.110046e-05 1.932698e-05
## Cumulative Proportion 9.997557e-01 0.9997796669 9.998008e-01 9.998201e-01
## Comp.69 Comp.70 Comp.71 Comp.72
## Standard deviation 1.634792e-03 1.609001e-03 1.521542e-03 1.496466e-03
## Proportion of Variance 1.701887e-05 1.648611e-05 1.474259e-05 1.426066e-05
## Cumulative Proportion 9.998371e-01 9.998536e-01 9.998683e-01 9.998826e-01
## Comp.73 Comp.74 Comp.75 Comp.76
## Standard deviation 1.490756e-03 1.406883e-03 1.298971e-03 1.213208e-03
## Proportion of Variance 1.415203e-05 1.260438e-05 1.074496e-05 9.372938e-06
## Cumulative Proportion 9.998968e-01 9.999094e-01 9.999201e-01 9.999295e-01
## Comp.77 Comp.78 Comp.79 Comp.80
## Standard deviation 1.174271e-03 9.812767e-04 9.261688e-04 8.841621e-04
## Proportion of Variance 8.780961e-06 6.131808e-06 5.462430e-06 4.978166e-06
## Cumulative Proportion 9.999383e-01 9.999444e-01 9.999499e-01 9.999548e-01
## Comp.81 Comp.82 Comp.83 Comp.84
## Standard deviation 8.242017e-04 7.837303e-04 7.404076e-04 6.966572e-04
## Proportion of Variance 4.325861e-06 3.911460e-06 3.490980e-06 3.090607e-06
## Cumulative Proportion 9.999592e-01 9.999631e-01 9.999666e-01 9.999696e-01
## Comp.85 Comp.86 Comp.87 Comp.88
## Standard deviation 6.877674e-04 6.422308e-04 6.157956e-04 5.893212e-04
## Proportion of Variance 3.012234e-06 2.626563e-06 2.414787e-06 2.211616e-06
## Cumulative Proportion 9.999727e-01 9.999753e-01 9.999777e-01 9.999799e-01
## Comp.89 Comp.90 Comp.91 Comp.92
## Standard deviation 5.676149e-04 5.405344e-04 4.647236e-04 4.511995e-04
## Proportion of Variance 2.051697e-06 1.860597e-06 1.375292e-06 1.296411e-06
## Cumulative Proportion 9.999820e-01 9.999838e-01 9.999852e-01 9.999865e-01
## Comp.93 Comp.94 Comp.95 Comp.96
## Standard deviation 4.441513e-04 4.336323e-04 4.218790e-04 4.122193e-04
## Proportion of Variance 1.256225e-06 1.197426e-06 1.133395e-06 1.082087e-06
## Cumulative Proportion 9.999878e-01 9.999890e-01 9.999901e-01 9.999912e-01
## Comp.97 Comp.98 Comp.99 Comp.100
## Standard deviation 3.829616e-04 3.501240e-04 3.345040e-04 3.168224e-04
## Proportion of Variance 9.339336e-07 7.806373e-07 7.125382e-07 6.392008e-07
## Cumulative Proportion 9.999921e-01 9.999929e-01 9.999936e-01 9.999942e-01
## Comp.101 Comp.102 Comp.103 Comp.104
## Standard deviation 3.038974e-04 2.847233e-04 2.784526e-04 2.693517e-04
## Proportion of Variance 5.881112e-07 5.162399e-07 4.937511e-07 4.620034e-07
## Cumulative Proportion 9.999948e-01 9.999953e-01 9.999958e-01 9.999963e-01
## Comp.105 Comp.106 Comp.107 Comp.108
## Standard deviation 2.629394e-04 2.505095e-04 2.356833e-04 2.235829e-04
## Proportion of Variance 4.402677e-07 3.996262e-07 3.537229e-07 3.183338e-07
## Cumulative Proportion 9.999967e-01 9.999971e-01 9.999975e-01 9.999978e-01
## Comp.109 Comp.110 Comp.111 Comp.112
## Standard deviation 2.091962e-04 1.959772e-04 1.876953e-04 1.796528e-04
## Proportion of Variance 2.786847e-07 2.445776e-07 2.243430e-07 2.055292e-07
## Cumulative Proportion 9.999981e-01 9.999983e-01 9.999986e-01 9.999988e-01
## Comp.113 Comp.114 Comp.115 Comp.116
## Standard deviation 1.704274e-04 1.627868e-04 1.535437e-04 1.405270e-04
## Proportion of Variance 1.849628e-07 1.687501e-07 1.501307e-07 1.257549e-07
## Cumulative Proportion 9.999989e-01 9.999991e-01 9.999993e-01 9.999994e-01
## Comp.117 Comp.118 Comp.119 Comp.120
## Standard deviation 1.324027e-04 1.163779e-04 1.044384e-04 1.036092e-04
## Proportion of Variance 1.116348e-07 8.624757e-08 6.945860e-08 6.836002e-08
## Cumulative Proportion 9.999995e-01 9.999996e-01 9.999997e-01 9.999997e-01
## Comp.121 Comp.122 Comp.123 Comp.124
## Standard deviation 8.749634e-05 7.782679e-05 7.687877e-05 7.468309e-05
## Proportion of Variance 4.875120e-08 3.857126e-08 3.763730e-08 3.551813e-08
## Cumulative Proportion 9.999998e-01 9.999998e-01 9.999998e-01 9.999999e-01
## Comp.125 Comp.126 Comp.127 Comp.128
## Standard deviation 7.096385e-05 6.456865e-05 6.076409e-05 4.474025e-05
## Proportion of Variance 3.206859e-08 2.654905e-08 2.351254e-08 1.274683e-08
## Cumulative Proportion 9.999999e-01 9.999999e-01 1.000000e+00 1.000000e+00
## Comp.129 Comp.130 Comp.131 Comp.132
## Standard deviation 3.279707e-05 2.660972e-05 2.523517e-05 1.918304e-05
## Proportion of Variance 6.849763e-09 4.509060e-09 4.055254e-09 2.343368e-09
## Cumulative Proportion 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## Comp.133 Comp.134 Comp.135 Comp.136
## Standard deviation 1.139498e-05 1.087581e-05 8.571961e-06 7.778894e-06
## Proportion of Variance 8.268611e-10 7.532328e-10 4.679139e-10 3.853375e-10
## Cumulative Proportion 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## Comp.137 Comp.138 Comp.139 Comp.140
## Standard deviation 4.987844e-06 3.732313e-10 3.375250e-10 4.513394e-11
## Proportion of Variance 1.584278e-10 8.870776e-19 7.254667e-19 1.297216e-20
## Cumulative Proportion 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## Comp.141 Comp.142
## Standard deviation 2.745726e-11 5.352892e-12
## Proportion of Variance 4.800869e-21 1.824662e-22
## Cumulative Proportion 1.000000e+00 1.000000e+00
From the summary, we can see that the first two components explain around 73% of the variance. So, with the code below I calculate the new coordinates based on these two components and plot them with different colors based on each match's over/under result.
# Coordinates of every match on the first two principal components.
p <- pca_m$scores[, c("Comp.1", "Comp.2")]

# Put the over/under label next to the two coordinates.
p1 <- data.table(cbind(Over_Under, p))

# Scatter plot of the two components; the colour index is the label + 1.
plot(p1$Comp.1, p1$Comp.2, col = p1$Over_Under + 1)
Based on the plot, I can see no clear separation between the over and under results.
With the code below, I apply MDS to the data, first with Euclidean distance and then with Manhattan distance. The data is already scaled, so I don’t have to worry about that.
# Pairwise Euclidean distances between the rows of the scaled odds table.
euc_dist <- dist(merged_matches_1, method = "euclidean")

# Classical multidimensional scaling into two dimensions.
mds1 <- cmdscale(euc_dist, k = 2)

# The MDS matrix has no column names, so its two columns become V2 and V3
# once the label column is bound in front of them.
p2 <- data.table(cbind(Over_Under, mds1))
plot(p2$V2, p2$V3, col = p2$Over_Under + 1)
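With the Euclidean distance, I cannot see a clear separation. Interestingly, the MDS coordinates came out as the PCA coordinates multiplied by -1. This is expected: classical MDS on Euclidean distances recovers the principal component scores, up to the sign of each axis.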
# Pairwise Manhattan distances between the rows of the scaled odds table.
man_dist <- dist(merged_matches_1, method = "manhattan")

# Classical multidimensional scaling into two dimensions.
mds2 <- cmdscale(man_dist, k = 2)

# As before, the unnamed MDS columns end up as V2 and V3 after the label.
p3 <- data.table(cbind(Over_Under, mds2))
plot(p3$V2, p3$V3, col = p3$Over_Under + 1)
Manhattan distance gives us a similar shape to the Euclidean distance. This time the coordinates are different from PCA. However, there is still no clear separation between the over and under game results.
This time, I check whether PCA can help with the separation of draws, home wins and away wins. With the code below I generate a color-coded graph.
# Same two principal-component coordinates, now labelled with the match
# outcome (0 = draw, 1 = home win, 2 = away win).
p4 <- data.table(cbind(Home_win, p))

# Scatter plot of the two components; the colour index is the outcome + 1.
plot(p4$Comp.1, p4$Comp.2, col = p4$Home_win + 1)
Based on the plot, we can see a better separation between the home wins and away wins, with draws being in between them.
With the “jpeg” package, I read a picture into R. To keep the computations fast, my picture was 400x400. First, I used rasterImage to display the picture in R. Then, with the image function, I plotted the three channels of the image.
# Read the picture; its three colour channels are indexed below as pic[, , 1..3].
pic <- readJPEG("hw2_pic.jpeg")

# Draw the full-colour picture over the whole unit square of an empty plot.
plot.new()
rasterImage(as.raster(pic), xleft = 0, ybottom = 0, xright = 1, ytop = 1)

# Show the three channels side by side, one panel per channel.
par(mfrow = c(1, 3))
for (channel in 1:3) {
  image(pic[, , channel])
}
With the code below, I added noise to the picture. Then, I scaled each channel back into the [0, 1] range. You can see the noisy image below.
# Number of pixels per channel, read from the picture itself instead of
# assuming a 400 x 400 image.
n_cols <- dim(pic)[2]
n <- dim(pic)[1] * n_cols

# Add independent uniform noise from [0, 0.1] to every channel, in channel
# order so the random draws are consumed in the same sequence as before.
for (channel in 1:3) {
  pic[, , channel] <- pic[, , channel] +
    matrix(runif(n, min = 0, max = 0.1), ncol = n_cols)
}

# Min-max rescale each noisy channel back to the [0, 1] range.
for (channel in 1:3) {
  channel_range <- range(pic[, , channel])
  pic[, , channel] <- (pic[, , channel] - channel_range[1]) /
    (channel_range[2] - channel_range[1])
}

# Noisy colour picture. The single-panel layout is set before the page is
# opened so that plot.new() already uses it.
par(mfrow = c(1, 1))
plot.new()
rasterImage(as.raster(pic), 0, 0, 1, 1)

# Noisy channels side by side. image() starts its own plot, so no extra
# plot.new() is needed before the panels.
par(mfrow = c(1, 3))
for (channel in 1:3) {
  image(pic[, , channel])
}
Greyscale conversion is done by adding all three channels together and then dividing this summation matrix by its maximum value.
# Greyscale picture: sum of the three channels divided by the largest sum,
# so the values lie in [0, 1].
channel_sum <- pic[, , 1] + pic[, , 2] + pic[, , 3]
graypic <- channel_sum / max(channel_sum)

# Go back to a single panel first: the channel plots before this point leave
# a 1 x 3 layout active on the device, which would squeeze the picture into
# one third of the page.
par(mfrow = c(1, 1))
plot.new()
rasterImage(as.raster(graypic), 0, 0, 1, 1)
I ran two for loops inside one another to get the 3x3 patches. Since the grayscale picture is already scaled, I didn’t worry about scaling it.
# Collect every 3x3 patch of the greyscale picture as one row of `e`.
# Patches are centred on the interior pixels only, so the border pixels never
# serve as a centre. The bounds come from the picture size (2:399 for the
# 400 x 400 picture).
stopifnot(nrow(graypic) >= 3, ncol(graypic) >= 3)
centre_rows <- 2:(nrow(graypic) - 1)
centre_cols <- 2:(ncol(graypic) - 1)

# Preallocate the result (one row per patch, nine pixel values per row).
# Growing it with rbind() inside the loop copies the whole matrix on every
# step and needs `e` to exist beforehand.
e <- matrix(NA_real_, nrow = length(centre_rows) * length(centre_cols), ncol = 9)

patch_index <- 0
for (i in centre_rows) {
  for (j in centre_cols) {
    patch_index <- patch_index + 1
    # as.vector() unrolls the patch column by column, as matrix(..., ncol = 1) did.
    e[patch_index, ] <- as.vector(graypic[(i - 1):(i + 1), (j - 1):(j + 1)])
  }
}
The PCA and image plotting are done below. I scaled all the image matrices so that the pixel values are between 0 and 1.
# Principal component analysis of the patches: each row of `e` is one patch
# and each of its nine columns one pixel position; the covariance matrix is used.
pca_g <- princomp(e, cor = FALSE)

# Standard deviation and share of variance of every component.
summary(pca_g)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.6517813 0.13004052 0.12455905 0.07238277
## Proportion of Variance 0.8919765 0.03550642 0.03257618 0.01100069
## Cumulative Proportion 0.8919765 0.92748290 0.96005908 0.97105978
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.068039264 0.062209051 0.048472971 0.04218271
## Proportion of Variance 0.009720056 0.008125624 0.004933429 0.00373610
## Cumulative Proportion 0.980779833 0.988905457 0.993838886 0.99757499
## Comp.9
## Standard deviation 0.033984614
## Proportion of Variance 0.002425014
## Cumulative Proportion 1.000000000
# Lay the first-component scores back out on the grid of patch centres.
# The patches were collected with the column index running fastest, so the
# matrix needs one row per interior image column: ncol(graypic) - 2, which is
# 398 for the 400 x 400 picture.
first_c <- matrix(pca_g$scores[, 1], nrow = ncol(graypic) - 2)

# Min-max rescale the scores to [0, 1] so they can be drawn as an image.
score_range <- range(first_c)
first_c <- (first_c - score_range[1]) / (score_range[2] - score_range[1])

summary(pca_g)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.6517813 0.13004052 0.12455905 0.07238277
## Proportion of Variance 0.8919765 0.03550642 0.03257618 0.01100069
## Cumulative Proportion 0.8919765 0.92748290 0.96005908 0.97105978
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.068039264 0.062209051 0.048472971 0.04218271
## Proportion of Variance 0.009720056 0.008125624 0.004933429 0.00373610
## Cumulative Proportion 0.980779833 0.988905457 0.993838886 0.99757499
## Comp.9
## Standard deviation 0.033984614
## Proportion of Variance 0.002425014
## Cumulative Proportion 1.000000000
# Lay the second- and third-component scores back out on the grid of patch
# centres. One matrix row per interior image column: ncol(graypic) - 2, which
# is 398 for the 400 x 400 picture.
second_c <- matrix(pca_g$scores[, 2], nrow = ncol(graypic) - 2)
score_range <- range(second_c)
second_c <- (second_c - score_range[1]) / (score_range[2] - score_range[1])

third_c <- matrix(pca_g$scores[, 3], nrow = ncol(graypic) - 2)
score_range <- range(third_c)
third_c <- (third_c - score_range[1]) / (score_range[2] - score_range[1])

# Draw the three score images next to each other on an empty canvas.
# Each matrix is transposed so that image rows run along the matrix rows again.
{
  plot(NA, xlim = c(0, 11), ylim = c(0, 8), type = "n",
       xaxt = "n", yaxt = "n", xlab = "", ylab = "")
  rasterImage(as.raster(t(first_c)), 0, 2, 3, 6)
  rasterImage(as.raster(t(second_c)), 4, 2, 7, 6)
  rasterImage(as.raster(t(third_c)), 8, 2, 11, 6)
}
# Arrange the loadings of the first three components as 3x3 matrices and
# min-max rescale each of them to [0, 1] so they can be drawn as images.
loading_images <- lapply(1:3, function(k) {
  loading_grid <- matrix(pca_g$loadings[, k], nrow = 3)
  (loading_grid - min(loading_grid)) / (max(loading_grid) - min(loading_grid))
})
ev1 <- loading_images[[1]]
ev2 <- loading_images[[2]]
ev3 <- loading_images[[3]]

# Draw the three loading images next to each other on an empty canvas,
# four units apart and three units wide.
{
  plot(NA, xlim = c(0, 11), ylim = c(0, 8), type = "n",
       xaxt = "n", yaxt = "n", xlab = "", ylab = "")
  left_edges <- c(0, 4, 8)
  for (k in 1:3) {
    rasterImage(as.raster(t(loading_images[[k]])),
                left_edges[k], 2, left_edges[k] + 3, 6)
  }
}
The first component of the PCA seems to be the negative of the picture. The eigenvectors seem to show the intensity in the left, middle and right side of the picture.